doctra 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doctra/cli/main.py CHANGED
@@ -9,6 +9,7 @@ detection results, and analyze document structure from the command line.
9
9
  import click
10
10
  import os
11
11
  import sys
12
+ import traceback
12
13
  from pathlib import Path
13
14
  from typing import Optional
14
15
 
@@ -25,6 +26,10 @@ except ImportError:
25
26
  from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
26
27
  from doctra.parsers.table_chart_extractor import ChartTablePDFParser
27
28
 
29
+ # Import additional modules
30
+ from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
31
+ from doctra.engines.image_restoration import DocResEngine
32
+
28
33
 
29
34
  @click.group(invoke_without_command=True)
30
35
  @click.pass_context
@@ -247,7 +252,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
247
252
  except Exception as e:
248
253
  click.echo(f"❌ Error initializing parser: {e}", err=True)
249
254
  if verbose:
250
- import traceback
251
255
  click.echo(traceback.format_exc(), err=True)
252
256
  sys.exit(1)
253
257
 
@@ -271,7 +275,6 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
271
275
  except Exception as e:
272
276
  click.echo(f"❌ Error during parsing: {e}", err=True)
273
277
  if verbose:
274
- import traceback
275
278
  click.echo(traceback.format_exc(), err=True)
276
279
  sys.exit(1)
277
280
  finally:
@@ -394,7 +397,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
394
397
  except Exception as e:
395
398
  click.echo(f"❌ Error initializing enhanced parser: {e}", err=True)
396
399
  if verbose:
397
- import traceback
398
400
  click.echo(traceback.format_exc(), err=True)
399
401
  sys.exit(1)
400
402
 
@@ -418,7 +420,6 @@ def enhance(pdf_path: Path, output_dir: Optional[Path], restoration_task: str,
418
420
  except Exception as e:
419
421
  click.echo(f"❌ Error during enhanced parsing: {e}", err=True)
420
422
  if verbose:
421
- import traceback
422
423
  click.echo(traceback.format_exc(), err=True)
423
424
  sys.exit(1)
424
425
  finally:
@@ -526,7 +527,6 @@ def charts(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
526
527
  except Exception as e:
527
528
  click.echo(f"❌ Error during chart extraction: {e}", err=True)
528
529
  if verbose:
529
- import traceback
530
530
  click.echo(traceback.format_exc(), err=True)
531
531
  sys.exit(1)
532
532
 
@@ -604,7 +604,6 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
604
604
  except Exception as e:
605
605
  click.echo(f"❌ Error during table extraction: {e}", err=True)
606
606
  if verbose:
607
- import traceback
608
607
  click.echo(traceback.format_exc(), err=True)
609
608
  sys.exit(1)
610
609
 
@@ -683,7 +682,6 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
683
682
  except Exception as e:
684
683
  click.echo(f"❌ Error during extraction: {e}", err=True)
685
684
  if verbose:
686
- import traceback
687
685
  click.echo(traceback.format_exc(), err=True)
688
686
  sys.exit(1)
689
687
 
@@ -772,7 +770,6 @@ def visualize(pdf_path: Path, pages: int, columns: int, width: int,
772
770
  except Exception as e:
773
771
  click.echo(f"❌ Error creating visualization: {e}", err=True)
774
772
  if verbose:
775
- import traceback
776
773
  click.echo(traceback.format_exc(), err=True)
777
774
  sys.exit(1)
778
775
 
@@ -805,7 +802,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
805
802
  click.echo(f"🔍 Analyzing: {pdf_path.name}")
806
803
 
807
804
  # Create layout engine for analysis only
808
- from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
809
805
 
810
806
  if verbose:
811
807
  click.echo(f" Using model: {layout_model}")
@@ -903,7 +899,6 @@ def analyze(pdf_path: Path, dpi: int, min_score: float, layout_model: str, verbo
903
899
  except Exception as e:
904
900
  click.echo(f"❌ Error analyzing PDF: {e}", err=True)
905
901
  if verbose:
906
- import traceback
907
902
  click.echo(traceback.format_exc(), err=True)
908
903
  sys.exit(1)
909
904
 
@@ -922,7 +917,6 @@ def info():
922
917
  click.echo("=" * 50)
923
918
 
924
919
  # Check Python version
925
- import sys
926
920
  python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
927
921
  click.echo(f"Python version: {python_version}")
928
922
 
@@ -1003,7 +997,6 @@ def info():
1003
997
  # DocRes information
1004
998
  click.echo("\nDocRes Image Restoration:")
1005
999
  try:
1006
- from doctra.engines.image_restoration import DocResEngine
1007
1000
  docres = DocResEngine()
1008
1001
  click.echo(f" ✅ DocRes available - {len(docres.get_supported_tasks())} restoration tasks")
1009
1002
  click.echo(" Tasks: dewarping, deshadowing, appearance, deblurring, binarization, end2end")
doctra/cli/utils.py CHANGED
@@ -7,8 +7,10 @@ different CLI commands.
7
7
 
8
8
  import click
9
9
  import sys
10
+ import traceback
10
11
  from typing import Optional, Dict, Any
11
12
  from pathlib import Path
13
+ from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
12
14
 
13
15
 
14
16
  def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
@@ -58,7 +60,6 @@ def handle_exception(e: Exception, verbose: bool = False) -> None:
58
60
  """
59
61
  click.echo(f"❌ Error: {e}", err=True)
60
62
  if verbose:
61
- import traceback
62
63
  click.echo(traceback.format_exc(), err=True)
63
64
  sys.exit(1)
64
65
 
@@ -271,8 +272,6 @@ def create_progress_callback(description: str, total: int):
271
272
  :return: Callable progress callback function that takes an integer
272
273
  representing the number of completed items
273
274
  """
274
- import sys
275
- from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar
276
275
 
277
276
  # Enhanced environment detection
278
277
  is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
@@ -18,6 +18,8 @@ import sys
18
18
  import cv2
19
19
  import numpy as np
20
20
  import torch
21
+ import tempfile
22
+ import time
21
23
  from pathlib import Path
22
24
  from typing import Union, List, Tuple, Optional, Dict, Any
23
25
 
@@ -85,12 +87,12 @@ def load_docres_weights_from_hf():
85
87
  if is_notebook:
86
88
  progress_bar = create_notebook_friendly_bar(
87
89
  total=2,
88
- desc="🔄 Downloading DocRes models from Hugging Face Hub"
90
+ desc="Downloading DocRes models from Hugging Face Hub"
89
91
  )
90
92
  else:
91
93
  progress_bar = create_beautiful_progress_bar(
92
94
  total=2,
93
- desc="🔄 Downloading DocRes models from Hugging Face Hub",
95
+ desc="Downloading DocRes models from Hugging Face Hub",
94
96
  leave=True
95
97
  )
96
98
 
@@ -308,8 +310,6 @@ class DocResEngine:
308
310
 
309
311
  def _run_single_task(self, img_array: np.ndarray, task: str, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
310
312
  """Run a single restoration task"""
311
- import tempfile
312
- import time
313
313
 
314
314
  # Create temporary file for inference
315
315
  with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_file:
@@ -322,7 +322,6 @@ class DocResEngine:
322
322
  os.chdir(str(docres_dir))
323
323
 
324
324
  # Set global DEVICE variable that DocRes inference expects
325
- import torch
326
325
  import inference # Import the inference module to set its global DEVICE
327
326
  inference.DEVICE = self.device
328
327
 
@@ -364,8 +363,6 @@ class DocResEngine:
364
363
 
365
364
  def _run_end2end_pipeline(self, img_array: np.ndarray, save_prompts: bool) -> Tuple[np.ndarray, Dict]:
366
365
  """Run the end2end pipeline: dewarping → deshadowing → appearance"""
367
- import tempfile
368
- import time
369
366
 
370
367
  intermediate_steps = {}
371
368
 
@@ -374,7 +371,6 @@ class DocResEngine:
374
371
  os.chdir(str(docres_dir))
375
372
 
376
373
  # Set global DEVICE variable that DocRes inference expects
377
- import torch
378
374
  import inference # Import the inference module to set its global DEVICE
379
375
  inference.DEVICE = self.device
380
376
 
@@ -482,7 +478,6 @@ class DocResEngine:
482
478
  """
483
479
  try:
484
480
  from PIL import Image
485
- import numpy as np
486
481
  from doctra.utils.pdf_io import render_pdf_to_images
487
482
 
488
483
  # Generate output path if not provided
@@ -510,12 +505,12 @@ class DocResEngine:
510
505
  if is_notebook:
511
506
  progress_bar = create_notebook_friendly_bar(
512
507
  total=len(pil_pages),
513
- desc="🔄 Processing pages"
508
+ desc="Processing pages"
514
509
  )
515
510
  else:
516
511
  progress_bar = create_beautiful_progress_bar(
517
512
  total=len(pil_pages),
518
- desc="🔄 Processing pages",
513
+ desc="Processing pages",
519
514
  leave=True
520
515
  )
521
516
 
@@ -1,17 +1,19 @@
1
- from pydantic import BaseModel
1
+ from pydantic import BaseModel, Field
2
2
 
3
3
  class Chart(BaseModel):
4
4
  """
5
5
  Structured representation of a chart extracted from an image.
6
6
 
7
- Contains the title, headers, and data rows extracted from a chart
8
- using VLM (Vision Language Model) processing.
7
+ Includes a title, a short description, column headers, and data rows
8
+ identified using VLM (Vision Language Model) processing.
9
9
 
10
- :param title: Title or caption of the chart
10
+ :param title: Title or caption of the chart (max 31 characters)
11
+ :param description: Short description of the chart (max 300 characters)
11
12
  :param headers: Column headers for the chart data
12
13
  :param rows: Data rows containing the chart values
13
14
  """
14
- title: str
15
+ title: str = Field(max_length=31)
16
+ description: str = Field(max_length=300)
15
17
  headers: list[str]
16
18
  rows: list[list[str]]
17
19
 
@@ -19,13 +21,15 @@ class Table(BaseModel):
19
21
  """
20
22
  Structured representation of a table extracted from an image.
21
23
 
22
- Contains the title, headers, and data rows extracted from a table
23
- using VLM (Vision Language Model) processing.
24
+ Includes a title, a short description, column headers, and data rows
25
+ identified using VLM (Vision Language Model) processing.
24
26
 
25
- :param title: Title or caption of the table
27
+ :param title: Title or caption of the table (max 31 characters)
28
+ :param description: Short description of the table (max 300 characters)
26
29
  :param headers: Column headers for the table data
27
30
  :param rows: Data rows containing the table values
28
31
  """
29
- title: str
32
+ title: str = Field(max_length=31)
33
+ description: str = Field(max_length=300)
30
34
  headers: list[str]
31
35
  rows: list[list[str]]
@@ -73,7 +73,7 @@ class VLMStructuredExtractor:
73
73
  Extract structured chart data from an image.
74
74
 
75
75
  :param image_path: Path to the chart image file
76
- :return: Chart object containing extracted title, headers, and data rows
76
+ :return: Chart object containing extracted title, description, headers, and data rows
77
77
  :raises Exception: If image processing or VLM extraction fails
78
78
  """
79
79
  prompt_text = (
@@ -81,6 +81,7 @@ class VLMStructuredExtractor:
81
81
  "If the title is not present in the image, generate a suitable title. "
82
82
  "Ensure that the table represents the data from the chart accurately."
83
83
  "The number of columns in the headers must match the number of columns in each row."
84
+ "Also provide a short description (max 300 characters) of the chart."
84
85
  )
85
86
  return self._call(prompt_text, image_path, Chart)
86
87
 
@@ -89,7 +90,7 @@ class VLMStructuredExtractor:
89
90
  Extract structured table data from an image.
90
91
 
91
92
  :param image_path: Path to the table image file
92
- :return: Table object containing extracted title, headers, and data rows
93
+ :return: Table object containing extracted title, description, headers, and data rows
93
94
  :raises Exception: If image processing or VLM extraction fails
94
95
  """
95
96
  prompt_text = (
@@ -97,5 +98,6 @@ class VLMStructuredExtractor:
97
98
  "Provide the headers and rows of the table, ensuring accuracy in the extraction. "
98
99
  "If the title is not present in the image, generate a suitable title."
99
100
  "The number of columns in the headers must match the number of columns in each row."
101
+ "Also provide a short description (max 300 characters) of the table."
100
102
  )
101
103
  return self._call(prompt_text, image_path, Table)
@@ -5,6 +5,7 @@ from typing import Dict, Any, List, Set
5
5
  import pandas as pd # pip install pandas openpyxl
6
6
  from openpyxl.styles import PatternFill, Font, Alignment
7
7
  from openpyxl.utils import get_column_letter
8
+ from openpyxl.worksheet.hyperlink import Hyperlink
8
9
 
9
10
  _INVALID_SHEET_CHARS = r'[:\\/*?\[\]]' # Excel-invalid characters
10
11
  _MAX_SHEET_LEN = 31
@@ -85,6 +86,61 @@ def _autosize_columns(ws, df: pd.DataFrame) -> None:
85
86
  ws.column_dimensions[get_column_letter(i)].width = min(max(10, max_len + 2), 60)
86
87
 
87
88
 
89
def _sheet_link_target(sheet_name: str) -> str:
    """
    Build the in-workbook hyperlink target for cell A1 of a sheet.

    The sheet name is always wrapped in single quotes and any apostrophe
    inside it is doubled. Quoting is harmless for plain names and required
    for names containing spaces or punctuation, so no case analysis is needed.

    :param sheet_name: Name of the destination worksheet
    :return: Hyperlink target string of the form ``#'<sheet>'!A1``
    """
    quoted = sheet_name.replace("'", "''")
    return f"#'{quoted}'!A1"


def _style_summary_sheet(ws, df: pd.DataFrame, sheet_mapping: dict = None) -> None:
    """
    Apply special styling to the summary sheet with text wrapping for descriptions.
    Add hyperlinks to table titles that link to their corresponding sheets.

    Every data cell gets wrapped, top-aligned text. Cells in the first column
    whose value is a key of ``sheet_mapping`` are turned into blue, underlined
    hyperlinks pointing at cell A1 of the mapped sheet. Columns A-D receive
    fixed widths and every data row a fixed height.

    :param ws: OpenPyXL worksheet object to style
    :param df: Pandas DataFrame containing the summary data (one row per item)
    :param sheet_mapping: Dictionary mapping table titles to their sheet names;
                          when empty or None no hyperlinks are added
    :return: None
    """
    n_cols = df.shape[1]

    # Style header row
    _style_header(ws, ncols=n_cols)

    # Style objects are created once and shared by all cells that use them.
    wrap_alignment = Alignment(wrap_text=True, vertical="top")
    link_font = Font(color="0000FF", underline="single")

    # Data rows start at worksheet row 2 (row 1 is the header).
    for row_idx in range(2, len(df) + 2):
        for col_idx in range(1, n_cols + 1):
            cell = ws.cell(row=row_idx, column=col_idx)
            cell.alignment = wrap_alignment

            # Only the title column (column A) can carry a hyperlink.
            if col_idx != 1 or not sheet_mapping:
                continue

            table_title = cell.value
            if table_title and table_title in sheet_mapping:
                link = _sheet_link_target(sheet_mapping[table_title])
                # ``ref`` names the cell that carries the link, ``target`` the destination.
                cell.hyperlink = Hyperlink(ref=cell.coordinate, target=link)
                cell.font = link_font

        # Fixed height leaves room for several wrapped lines of description.
        ws.row_dimensions[row_idx].height = 60

    # Fixed column widths for the summary layout:
    # A = table title, B = description (wide, wrapped), C = page, D = type.
    ws.column_dimensions['A'].width = 30
    ws.column_dimensions['B'].width = 60
    ws.column_dimensions['C'].width = 10
    ws.column_dimensions['D'].width = 12
142
+
143
+
88
144
  def _normalize_data(headers: List[str], rows: List[List]) -> tuple[List[str], List[List]]:
89
145
  """
90
146
  Normalize headers and rows to ensure consistent dimensions.
@@ -159,6 +215,31 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
159
215
  taken: Set[str] = set()
160
216
 
161
217
  with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
218
+ # Create summary sheet first
219
+ summary_data = []
220
+ sheet_mapping = {} # Map table titles to their sheet names
221
+
222
+ for item in valid_items:
223
+ title = item.get("title") or "Untitled"
224
+ description = item.get("description") or "No description available"
225
+ page_number = item.get("page", "Unknown")
226
+ item_type = item.get("type", "Table") # Default to "Table" if not specified
227
+
228
+
229
+ summary_data.append({
230
+ "Table Title": title,
231
+ "Description": description,
232
+ "Page": page_number,
233
+ "Type": item_type
234
+ })
235
+
236
+ # Create summary sheet first (but without hyperlinks initially)
237
+ if summary_data:
238
+ summary_df = pd.DataFrame(summary_data)
239
+ summary_df.to_excel(writer, sheet_name="Table Summary", index=False)
240
+ taken.add("Table Summary")
241
+
242
+ # Process individual table sheets to build sheet mapping
162
243
  for item in valid_items:
163
244
  try:
164
245
  title = item.get("title") or "Untitled"
@@ -166,6 +247,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
166
247
  rows = item.get("rows") or []
167
248
 
168
249
  sheet_name = _safe_sheet_name(title, taken)
250
+
251
+ # Add to sheet mapping for hyperlinks
252
+ sheet_mapping[title] = sheet_name
169
253
 
170
254
  # Normalize data to handle mismatched dimensions
171
255
  normalized_headers, normalized_rows = _normalize_data(headers, rows)
@@ -194,4 +278,9 @@ def write_structured_excel(excel_path: str, items: List[Dict[str, Any]]) -> str
194
278
  print(f"Error processing item '{item.get('title', 'Unknown')}': {e}")
195
279
  continue
196
280
 
281
+ # Now add hyperlinks to the summary sheet (after all sheets are created)
282
+ if summary_data and sheet_mapping:
283
+ summary_ws = writer.sheets["Table Summary"]
284
+ _style_summary_sheet(summary_ws, summary_df, sheet_mapping)
285
+
197
286
  return excel_path
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
  import os
3
3
  import re
4
4
  import base64
5
- from typing import List, Dict, Any
5
+ from typing import List, Dict, Any, Optional
6
6
  from markdown_it import MarkdownIt
7
7
 
8
8
 
@@ -64,6 +64,114 @@ def _process_image_paths(md_content: str, out_dir: str) -> str:
64
64
  return processed_content
65
65
 
66
66
 
67
def _build_lines_html_document(css: str, body_html: str) -> str:
    """
    Wrap prepared body markup in the complete standalone HTML page.

    :param css: Stylesheet text placed inside the page's ``<style>`` element
    :param body_html: Markup placed inside the page's ``<main>`` element
    :return: Full HTML document as a single string
    """
    # Literal braces of the embedded JavaScript are doubled because this is an f-string.
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document Analysis Results</title>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
    <style>
    {css}
    </style>
</head>
<body>
    <button class="theme-toggle" onclick="toggleTheme()" title="Toggle dark mode"></button>
    <div class="container">
        <header class="header">
            <div class="header-content">
                <div class="header-text">
                    <h1>Document Analysis Results</h1>
                    <p class="subtitle">Intelligent Document Processing & Analysis</p>
                </div>
                <div class="header-badge">
                    Generated by Doctra
                </div>
            </div>
        </header>
        <main class="content">
            {body_html}
        </main>
        <footer class="footer">
            <div class="footer-content">
                <div class="footer-brand">Doctra</div>
                <div class="footer-info">
                    <span>Intelligent Document Processing</span>
                    <a href="https://github.com/AdemBoukhris457/Doctra" target="_blank" rel="noopener noreferrer">GitHub</a>
                </div>
            </div>
        </footer>
    </div>
    <script>
        // Theme toggle functionality
        function toggleTheme() {{
            const body = document.body;
            const currentTheme = body.getAttribute('data-theme');
            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';

            body.setAttribute('data-theme', newTheme);
            localStorage.setItem('doctra-theme', newTheme);

            // Add smooth transition
            body.style.transition = 'all 0.3s ease';
            setTimeout(() => {{
                body.style.transition = '';
            }}, 300);
        }}

        // Load saved theme on page load
        document.addEventListener('DOMContentLoaded', function() {{
            const savedTheme = localStorage.getItem('doctra-theme') || 'light';
            document.body.setAttribute('data-theme', savedTheme);
        }});

        // Add smooth scroll behavior
        document.documentElement.style.scrollBehavior = 'smooth';

        // Add loading animation
        window.addEventListener('load', function() {{
            document.body.style.opacity = '0';
            document.body.style.transition = 'opacity 0.5s ease';
            setTimeout(() => {{
                document.body.style.opacity = '1';
            }}, 100);
        }});
    </script>
</body>
</html>"""


def write_html_from_lines(html_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
    """
    Convert HTML lines directly into a single HTML file and save it.

    This function is used when VLM is enabled to ensure proper HTML table formatting
    instead of markdown-to-HTML conversion.

    The lines are joined with newlines, runs of three or more newlines are
    collapsed to a single blank line, image paths and table styling are
    post-processed by the module helpers, and the result is embedded in the
    standalone page template before being written as UTF-8.

    :param html_lines: List of HTML strings to join into a single file
    :param out_dir: Directory where the HTML file will be saved (created if missing)
    :param filename: Name of the HTML file (default: "result.html")
    :return: The absolute path of the written HTML file
    """
    os.makedirs(out_dir, exist_ok=True)

    # Join HTML lines and clean up excessive blank lines
    body_html = "\n".join(html_lines).strip() + "\n"
    body_html = re.sub(r"\n{3,}", "\n\n", body_html)

    # Image paths first, then table styling, so styling sees the final markup.
    body_html = _process_image_paths(body_html, out_dir)
    body_html = _add_table_styling(body_html)

    html_document = _build_lines_html_document(_get_css_styles(), body_html)

    html_path = os.path.join(out_dir, filename)
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(html_document)

    return os.path.abspath(html_path)
173
+
174
+
67
175
  def write_html(md_lines: List[str], out_dir: str, filename: str = "result.html") -> str:
68
176
  """
69
177
  Convert collected Markdown lines into a single HTML file and save it.
@@ -414,6 +522,54 @@ def _create_html_table(headers: List[str], rows: List[List]) -> str:
414
522
  """
415
523
 
416
524
 
525
def render_html_table(
    headers: Optional[List[str]],
    rows: Optional[List[List[str]]],
    title: Optional[str] = None,
) -> str:
    """
    Render an HTML table from headers, rows, and optional title.

    Creates a properly formatted HTML table with headers, data rows,
    and optional title. This is used for VLM-extracted tables to ensure
    they display as proper HTML tables instead of markdown.

    When there are no headers and no row contains a cell, a "no data"
    placeholder paragraph is returned instead of a table.

    :param headers: List of column headers (optional; when missing, headers
                    "Column 1", "Column 2", ... are generated to match the
                    widest row)
    :param rows: List of data rows, where each row is a list of cell values
    :param title: Optional title to display above the table
    :return: Formatted HTML table string
    """
    headers = headers or []
    rows = rows or []

    # Nothing to render: no headers and no row holding at least one cell.
    # Rows such as [[]] count as empty so a zero-column table is never built.
    if not headers and not any(len(r) for r in rows):
        return "<p class='no-data'>No data available</p>"

    # Generate headers if not provided, sized to the widest row.
    if not headers:
        width = max(len(r) for r in rows)
        headers = [f"Column {i+1}" for i in range(width)]

    # Normalize data to handle mismatched dimensions
    normalized_headers, normalized_rows = _normalize_data(headers, rows)

    # Create HTML table
    table_html = _create_html_table(normalized_headers, normalized_rows)

    if not title:
        return table_html

    # The title is escaped before being embedded in the heading.
    return f"""
    <div class="table-section">
        <h3 class="table-title">{_escape_html(title)}</h3>
        {table_html}
    </div>
    """
571
+
572
+
417
573
  def _add_table_styling(html_content: str) -> str:
418
574
  """
419
575
  Add table styling wrapper to HTML content.
@@ -884,6 +1040,55 @@ def _get_css_styles() -> str:
884
1040
  content: '☀️';
885
1041
  }
886
1042
 
1043
+ /* Dark mode table styles */
1044
+ [data-theme="dark"] .markdown-table,
1045
+ [data-theme="dark"] table {
1046
+ background: var(--card-bg);
1047
+ border-color: var(--border-color);
1048
+ }
1049
+
1050
+ [data-theme="dark"] .markdown-table th,
1051
+ [data-theme="dark"] table th {
1052
+ background: #374151;
1053
+ color: #f9fafb;
1054
+ border-bottom-color: var(--accent-color);
1055
+ }
1056
+
1057
+ [data-theme="dark"] .markdown-table td,
1058
+ [data-theme="dark"] table td {
1059
+ color: #f9fafb;
1060
+ border-bottom-color: var(--border-color);
1061
+ }
1062
+
1063
+ [data-theme="dark"] .markdown-table tr:nth-child(even),
1064
+ [data-theme="dark"] table tr:nth-child(even) {
1065
+ background: #374151;
1066
+ }
1067
+
1068
+ [data-theme="dark"] .markdown-table tr:hover,
1069
+ [data-theme="dark"] table tr:hover {
1070
+ background: #4b5563;
1071
+ }
1072
+
1073
+ /* Dark mode footer styles to match header */
1074
+ [data-theme="dark"] .footer {
1075
+ background: var(--primary-color);
1076
+ color: white;
1077
+ border-top-color: var(--accent-color);
1078
+ }
1079
+
1080
+ [data-theme="dark"] .footer-brand {
1081
+ color: white;
1082
+ }
1083
+
1084
+ [data-theme="dark"] .footer a {
1085
+ color: rgba(255, 255, 255, 0.8);
1086
+ }
1087
+
1088
+ [data-theme="dark"] .footer a:hover {
1089
+ color: white;
1090
+ }
1091
+
887
1092
  /* Professional scrollbar */
888
1093
  ::-webkit-scrollbar {
889
1094
  width: 8px;